in this document I create a new version for the flowcam classifier at 18 degrees and 30% light in early Feb 2022. Crypto is only kept at lower light and some classes (airbubbles, debris, etc.) have not been remearured.

0.1 Dependencies

library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)

set.seed(1)

0.2 Data

0.2.1 new data

colnames <- c("Particle_ID","Area_ABD","Area_Filled","Aspect_Ratio","Average_Blue",
              "Average_Green","Average_Red","Calibration_Factor","Calibration_Image",
              "Camera","Capture_X","Capture_Y","Ch1_Area","Ch1_Peak","Ch1_Width",
              "Ch2_Area","Ch2_Peak","Ch2_Width","Ch2_Ch1_Ratio","Circle_Fit",
              "Circularity","Circularity_Hu","Compactness","Convex_Perimeter",
              "Convexity","Date","Diameter_ABD","Diameter_ESD","Edge_Gradient",
              "Elongation","Feret_Angle_Max","Feret_Angle_Min","Fiber_Curl",
              "Fiber_Straightness","Filter_Score","Geodesic_Aspect_Ratio",
              "Geodesic_Length","Geodesic_Thickness","Image_File","Image_Height",
              "Image_Width","Image_X","Image_Y","Intensity","Length",
              "Particles_Per_Chain","Perimeter","Ratio_Blue_Green","Ratio_Red_Blue",
              "Ratio_Red_Green","Roughness","Scatter_Area","Scatter_Peak","Scatter_Width",
              "Sigma_Intensity","Source_Image","Sum_Intensity","Symmetry","Time",
              "Timestamp","Transparency","Volume_ABD","Volume_ESD","Width","species")

Chlamydomonas <- read_csv("Data/Chlamydomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"

Cosmarium <- read_csv("Data/Cosmarium_dark_February_2022_library.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"

Desmodesmus <- read_csv("Data/Desmodesmus_dark_February_2022_library.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"

Monoraphidium <- read_csv("Data/Monoraphidium_dark_February_2022_library.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"

Staurastrum1 <- read_csv("Data/Staurastrum1_dark_February_2022_library.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("Data/Staurastrum2_dark_February_2022_library.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"

Cryptomonas <- read_csv("Data/Cryptomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"

dd_all_feb22 <- rbind(Chlamydomonas,Cosmarium,Desmodesmus,
                      Monoraphidium,Staurastrum1,Staurastrum2,
                      Cryptomonas) 

colnames(dd_all_feb22) <- colnames

0.2.2 old data and merge

dd_all <- read_csv("../Class_18C_normalLight/1st_class_2020/Data/libraries_FC100/dd_all.csv", show_col_types = FALSE) %>%
  dplyr::filter(!(species %in% c("Chlamydomonas","Cosmarium","Desmodesmus",
                                 "Monoraphidium","Staurastrum1","Staurastrum2",
                                 "Cryptomonas")))

dd_all <- rbind(dd_all_feb22,dd_all) %>%
  dplyr::select(-Date)
table(dd_all$species)

0.3 filtering out outliers

dd_all$id <- 1:nrow(dd_all)
dd_all_long <- dd_all %>%
  dplyr::select(species, Area_ABD,Area_Filled,Aspect_Ratio,Average_Blue,
                Average_Green,Average_Red,Circle_Fit,Circularity,
                Compactness,Convexity,Diameter_ABD,Diameter_ESD,Edge_Gradient,
                Elongation,Feret_Angle_Max,Feret_Angle_Min,Fiber_Curl,
                Fiber_Straightness,Geodesic_Aspect_Ratio,Intensity,
                Length,Perimeter,Ratio_Blue_Green,Ratio_Red_Blue,
                Ratio_Red_Green,Roughness,Sigma_Intensity,Sum_Intensity,
                Symmetry,Transparency,Width, id) %>%
  pivot_longer(cols=-c(id,species), names_to="variable") %>%
  dplyr::group_by(variable,species) %>%
  mutate(iqr = IQR(value),
         first_quartile = quantile(value, probs = 0.25),
         third_quartile = quantile(value, probs = 0.75),
         outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))



outliers <- dd_all_long %>%
  dplyr::filter(outlier==T) %>%
  dplyr::group_by(species, id) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::filter(n>6)
## `summarise()` has grouped output by 'species'. You can override using the `.groups` argument.
table(outliers$species)
## 
##            airbubbles         Chlamydomonas   ChlamydomonasClumps 
##                    15                   111                     9 
##         Coleps_irchel             Colpidium     ColpidiumVacuoles 
##                    39                    11                    31 
##             Cosmarium           Cryptomonas                Debris 
##                     6                    33                    10 
##           Desmodesmus            Dexiostoma         DigestedAlgae 
##                    76                    18                     5 
## DividingChlamydomonas         Loxocephallus         Monoraphidium 
##                    13                    18                    76 
##          OtherCiliate    Small_unidentified          Staurastrum1 
##                     1                   219                     3 
##          Staurastrum2           Tetrahymena 
##                     7                     9
nrow(outliers)/nrow(dd_all)
## [1] 0.04188791
dd_all_filtered <- dd_all %>%
  dplyr::filter(!is.element(id,outliers$id))

dd_all$removed_outliers <- F
dd_all_filtered$removed_outliers <- T

dd_all_comparison <- rbind(dd_all,dd_all_filtered)


dd_all_comparison %>% 
  ggplot(aes(log10(Area_ABD), col=removed_outliers))+
  geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
  geom_boxplot(outlier.alpha = 0.3) +
  theme_bw() +
  facet_wrap(~species, scales = "free_y")  +
  geom_vline(xintercept = 1, col="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dd_all_filtered %>%
  ggplot(aes(log10(Area_ABD)))+
  geom_density(aes(col=species))

0.4 Species compositions

species <- unique(dd_all$species)
species <- species[!is.element(species,c("airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                                         "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))]
compositions <- read_csv("../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition

compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- comp_name

0.5 Classifiers

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}
formula <- factor(species) ~ Area_ABD + Area_Filled + Aspect_Ratio +
      Average_Blue + Average_Green + Average_Red + Circle_Fit +
      Circularity + Compactness +
      Convexity + Diameter_ABD + Diameter_ESD + Edge_Gradient +
      Elongation + Feret_Angle_Max + Feret_Angle_Min + Fiber_Curl +
      Fiber_Straightness + Geodesic_Aspect_Ratio + Intensity + Length +
      Perimeter + Ratio_Blue_Green + Ratio_Red_Blue + Ratio_Red_Green +
      Roughness + Sigma_Intensity + Sum_Intensity + Symmetry + Transparency +
      Width

set.seed(62)
classifiers_18c <- list()
classifiers_18c_plots <- list()
classifiers_18c_assess_plots <- list()

trainingdata <- dd_all[complete.cases(dd_all), ]


for(i in 1:length(compositions_list)){
  
  if("Colpidium" %in% compositions_list[[i]]) {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
                            "OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
  } else {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
                            "DigestedAlgae","DividingChlamydomonas"))
  }

  n.var <- length(unique(sub_trainingdata$species))
  sub_trainingdata$species <- factor(sub_trainingdata$species)
  
  split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
  subsamples <- lapply(split_up, function(x) {
    x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
  sub_trainingdata <- do.call("rbind", subsamples)
  
  # table(sub_trainingdata$species)
  # A stratified random split of the data
  idx_train <- createDataPartition(sub_trainingdata$species,
                                   p = 0.7, # percentage of data as training
                                   list = FALSE)
  
  sub_testdata <- sub_trainingdata[-idx_train,]
  sub_trainingdata <- sub_trainingdata[idx_train,]
  
  n.min <- min(table(sub_trainingdata$species))
  
  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c[[i]] <- obj$best.model
  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                       sub_testdata$species)
  
  classifiers_18c_plots[[i]] <- classifier_plot_svm(table = cf$table,
                                                        combination.nr = names(compositions_list)[[i]])
  
  classifiers_18c_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}

names(classifiers_18c_plots) <- names(classifiers_18c) <-
  names(classifiers_18c_assess_plots) <- comp_name
library(patchwork)
for(i in 1:length(classifiers_18c_plots)){
  print(classifiers_18c_plots[[i]] + classifiers_18c_assess_plots[[i]] + 
    plot_layout(widths = c(7,3)))
}

0.6 Save classifiers

saveRDS(classifiers_18c, "svm_flowcam_classifiers_18c_decrLight18_trained_earlyFeb22.rds")
# cl <- readRDS("classifiers_18c_25x.rds")
# labels <- dd_all %>% 
#   group_by(species) %>%  
#   summarise(xPos = max(Area_Filled),
#             yPos = max((density(Area_Filled))$y))
# 
# library(directlabels)
# direct.label(qplot(Area_Filled,data=dd_all,colour=species,geom="density",log="x"))